In [1]:
import numpy as np
import pandas as pd

Object Creation

Creating a Series: a 1D labelled array (built on an ndarray)


In [2]:
print(pd.Series( [1, 5, 2, 3, 6, 4] ) )


0    1
1    5
2    2
3    3
4    6
5    4
dtype: int64

Checking for a value in a Series' index and in its values


In [3]:
s = pd.Series(range(5), index=list('abcde'))
print('b' in s) # checks index values
print(s.isin([2])) # checks in values


True
a    False
b    False
c     True
d    False
e    False
dtype: bool

In [4]:
print(s)
s = s.reindex(list('afg'))
s # NaN forces the whole Series to float dtype. To avoid this, create the Series with object dtype (or a nullable dtype) up front.


a    0
b    1
c    2
d    3
e    4
dtype: int32
Out[4]:
a    0.0
f    NaN
g    NaN
dtype: float64

In [5]:
s_obj = pd.Series(range(5), index=list('abcde'), dtype=object)   # np.object is deprecated; plain object works
print(s_obj)
s_obj = s_obj.reindex(list('afg'))
s_obj # No change in the dtype


a    0
b    1
c    2
d    3
e    4
dtype: object
Out[5]:
a      0
f    NaN
g    NaN
dtype: object
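
An alternative sketch (assuming pandas >= 0.24, not part of the original notebook): the nullable 'Int64' extension dtype keeps integer values and marks missing entries as <NA> instead of falling back to float or object.

s_int = pd.Series(range(5), index=list('abcde'), dtype='Int64')
print(s_int.reindex(list('afg')))   # dtype stays Int64; missing labels become <NA>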

Creating DataFrame


In [6]:
# 1. Using a list of dictionaries
lst = [{"C1": 1, "C2": 2},
      {"C1": 5, "C2": 10, "C3": 20}]

In [7]:
# Observe the NaN where 'C3' is missing in the first row
print(pd.DataFrame(lst, index = ["R1", "R2"]))


    C1  C2    C3
R1   1   2   NaN
R2   5  10  20.0

In [8]:
# 2. Using a dict of lists (commonly used)
dct = {"C1": ["1", "3"],
      "C2": ["2","4"]}
print( pd.DataFrame(dct, index = ["R1", "R2"]) )


   C1 C2
R1  1  2
R2  3  4

In [9]:
# Defining index and column names inside the DataFrame constructor
print(pd.DataFrame(np.random.randn(2,2),
                  index = list('pq'),
                   columns = list('ab')))


          a         b
p  0.563066  0.494843
q -0.051123  1.303799

Data types


In [10]:
df = pd.DataFrame({'A': [10., 20.],
                  'B': "text",
                  'C': [2,60],
                  'D': 3+9j})
print(df)


      A     B   C       D
0  10.0  text   2  (3+9j)
1  20.0  text  60  (3+9j)

In [11]:
# Note the trailing 'dtype: object': df.dtypes is itself a Series,
# and object dtype lets it hold the mixed per-column dtypes.
print(df.dtypes)


A       float64
B        object
C         int64
D    complex128
dtype: object

In [12]:
print(df.info()) # the reported memory usage does not count the contents of object-dtype columns (note the '+')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
A    2 non-null float64
B    2 non-null object
C    2 non-null int64
D    2 non-null complex128
dtypes: complex128(1), float64(1), int64(1), object(1)
memory usage: 160.0+ bytes
None

In [13]:
df.info(memory_usage='deep') # with 'deep', the contents of object-dtype columns are counted as well


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
A    2 non-null float64
B    2 non-null object
C    2 non-null int64
D    2 non-null complex128
dtypes: complex128(1), float64(1), int64(1), object(1)
memory usage: 286.0 bytes
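
For a per-column breakdown, a small aside (the same 'deep' caveat applies):

print(df.memory_usage(deep=True))   # bytes used by the index and by each column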

Viewing Data


In [14]:
print(pd.DataFrame(np.random.randn(20,2)).head())


          0         1
0  2.076125 -0.371193
1 -0.620316 -0.521640
2 -0.555493 -0.146482
3 -0.614715  1.007617
4  0.219642 -0.311603

In [15]:
print(pd.DataFrame(np.random.randn(20,2)).tail(5))


           0         1
15  0.938823 -1.065879
16  1.284274  0.952037
17  1.633314 -0.249550
18 -0.881654 -1.234461
19  1.162749  1.542099

In [16]:
df = pd.DataFrame({'A': [10., 20.],
                  'B': "text",
                  'C': [2,60],
                  'D': 3+9j}, index = list('PQ'))

In [17]:
print(df.index)     # row index


Index(['P', 'Q'], dtype='object')

In [18]:
print(df.columns)   # index of columns


Index(['A', 'B', 'C', 'D'], dtype='object')

In [19]:
print(df.values)    # the underlying data as a NumPy array


[[10.0 'text' 2 (3+9j)]
 [20.0 'text' 60 (3+9j)]]

DataFrame Attributes


In [20]:
df = pd.DataFrame({'A': [1,4,7],
                   'B': [2,5,8],
                   'C': [3,6,9]},
                   index = list('PQR'))
print(df)


   A  B  C
P  1  2  3
Q  4  5  6
R  7  8  9

In [21]:
# Transpose
print(df.T)


   P  Q  R
A  1  4  7
B  2  5  8
C  3  6  9

In [22]:
# shape, size and dimensions
print(df.shape, df.size, df.ndim)


(3, 3) 9 2

DataFrame Methods


In [23]:
df1 = pd.DataFrame({'A': [1,4,7],
                   'B': [2,5,8],
                   'C': [3,6,9]},
                   index = list('PQR'))
df_temp = df1   # just another reference to df1, not a copy (use df1.copy() for an independent frame)
df2 = pd.DataFrame({'A': [-1,4,7],
                   'B': [2,-5,8],
                   'C': [3,6,-9]},
                   index = list('PQR'))

df3 = pd.DataFrame({'A': [1,-4,-7],
                   'B': [-2,5,-8],
                   'C': [-3,-6,9]},
                   index = list('STU'))

Addition | Subtraction


In [24]:
print(df1.add(df2))


    A   B   C
P   0   4   6
Q   8   0  12
R  14  16   0

In [25]:
print(df1.sub(df3))


    A   B   C
P NaN NaN NaN
Q NaN NaN NaN
R NaN NaN NaN
S NaN NaN NaN
T NaN NaN NaN
U NaN NaN NaN

Appending 2 dfs


In [26]:
print(pd.concat([df_temp, df2]))   # DataFrame.append() is deprecated/removed; pd.concat is its replacement


   A  B  C
P  1  2  3
Q  4  5  6
R  7  8  9
P -1  2  3
Q  4 -5  6
R  7  8 -9

Applying a function


In [27]:
print(df3.apply(np.abs, axis = 1))


   A  B  C
S  1  2  3
T  4  5  6
U  7  8  9

In [28]:
df3


Out[28]:
   A  B  C
S  1 -2 -3
T -4  5 -6
U -7 -8  9

In [29]:
df3.apply(np.sum, axis = 0)


Out[29]:
A   -10
B    -5
C     0
dtype: int64
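
apply also accepts a user-defined function; a quick sketch (not in the original notebook) computing the range (max - min) of each row of df3:

print(df3.apply(lambda row: row.max() - row.min(), axis = 1))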

Converting frame to its numpy array representation


In [30]:
print(type(df3.to_numpy()))   # .as_matrix() was removed; .to_numpy() (or .values) is the replacement


<class 'numpy.ndarray'>

Counting non-NaN values


In [31]:
df1


Out[31]:
   A  B  C
P  1  2  3
Q  4  5  6
R  7  8  9

In [32]:
print(df1.count())  # by default counts down each column


A    3
B    3
C    3
dtype: int64

In [33]:
print(df1.count(axis = 1))  # counts across each row


P    3
Q    3
R    3
dtype: int64

In [34]:
print(df1.sub(df3).count())   # every entry is NaN, so all counts are 0


A    0
B    0
C    0
dtype: int64

Handling NULL values


In [35]:
df = pd.DataFrame(np.nan,columns = list('AB'),
                 index = list('CD'))
print(df)


    A   B
C NaN NaN
D NaN NaN

In [36]:
print(df.isnull())


      A     B
C  True  True
D  True  True

Filling NaNs


In [37]:
print(df1.sub(df3).fillna(5))


     A    B    C
P  5.0  5.0  5.0
Q  5.0  5.0  5.0
R  5.0  5.0  5.0
S  5.0  5.0  5.0
T  5.0  5.0  5.0
U  5.0  5.0  5.0
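
fillna can also take a per-column mapping of fill values; a minimal sketch on a small hypothetical frame df_na (not from the original notebook):

df_na = pd.DataFrame({'A': [1, np.nan], 'B': [np.nan, 4]})
print(df_na.fillna({'A': 0, 'B': -1}))   # fill column A with 0, column B with -1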

Drop NaNs


In [38]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0], 
                  [3, 4, np.nan, 1],
                  [np.nan, np.nan, np.nan, 5]],
                  columns=list('ABCD'))
print(df)


     A    B   C  D
0  NaN  2.0 NaN  0
1  3.0  4.0 NaN  1
2  NaN  NaN NaN  5

In [39]:
print(df.dropna(axis = 1, how = 'all'))


     A    B  D
0  NaN  2.0  0
1  3.0  4.0  1
2  NaN  NaN  5

In [40]:
print(df.dropna(axis = 1, how='any'))


   D
0  0
1  1
2  5
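
dropna also supports a threshold; a brief sketch keeping only rows with at least two non-NaN values:

print(df.dropna(axis = 0, thresh = 2))   # row 2 has only one real value and is dropped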

Handling Duplicates


In [41]:
df = pd.DataFrame([[1, 2, 3], 
                  [1, 2, 3], 
                  [4, 5, 6],
                  [1, 2, 3]], columns =list('ABC'))
# flag rows that duplicate an earlier row
print(df.duplicated())


0    False
1     True
2    False
3     True
dtype: bool
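
To actually remove the flagged rows, drop_duplicates() keeps the first occurrence by default (a short follow-up sketch):

print(df.drop_duplicates())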

Checking the presence of values in a df


In [42]:
df = pd.DataFrame([["hi", 2, 3], 
                  [1, 20, 3+3j], 
                  [4, "world", 6]], 
                   columns =list('ABC'))
print(df)


    A      B       C
0  hi      2  (3+0j)
1   1     20  (3+3j)
2   4  world  (6+0j)

In [43]:
print(df.isin( [20, 6+0j, 'hi'] ))


       A      B      C
0   True  False  False
1  False   True  False
2  False  False   True

Selection

Selection by Label


In [44]:
df = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]], 
                  index = list('ABC'),
                  columns = list('DE'))
print(df)


    D   E
A  15  12
B  33  54
C  10  32

In [45]:
print(df.loc[['A','C'],:])


    D   E
A  15  12
C  10  32

In [46]:
print(df.loc[:,'E'])


A    12
B    54
C    32
Name: E, dtype: int64

In [47]:
print(df.loc['B', 'D'])


33

Selection by Position


In [48]:
df = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]])

print(df)


    0   1
0  15  12
1  33  54
2  10  32

In [49]:
print(df.iloc[[0,2],:])


    0   1
0  15  12
2  10  32

In [50]:
print(df.iloc[:,0:1])


    0
0  15
1  33
2  10

In [51]:
print(df.iloc[:,:])


    0   1
0  15  12
1  33  54
2  10  32

RegEx


In [52]:
df = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]], 
                  index = ['one','two','three'],
                  columns = ['col1', 'col2'])

print(df)


       col1  col2
one      15    12
two      33    54
three    10    32

In [53]:
print(df.filter(regex = 'e$', axis = 0))


       col1  col2
one      15    12
three    10    32

In [54]:
print(df.filter(regex = '^c', axis = 1))


       col1  col2
one      15    12
two      33    54
three    10    32
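
filter also offers a simpler substring match via the like argument (a small aside, not from the original notebook):

print(df.filter(like = 'col', axis = 1))   # columns whose name contains 'col'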

Boolean indexing


In [55]:
df = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]])

df[df < 15] = 10
df


Out[55]:
    0   1
0  15  10
1  33  54
2  10  32
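
A boolean condition can also select whole rows instead of masking individual cells; a brief sketch on the same frame:

print(df[df[0] > 10])   # rows where column 0 exceeds 10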

Operations


In [56]:
df1 = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]],
                  columns = list('AB')) 
df2 = pd.DataFrame([[15, 12, -3],
                  [33, 54, 21],
                  [10, 32, 22]],
                  columns = list('ABC'))
df3 = pd.DataFrame([[10, 1, 3],
                  [33, -54, 2],
                  [10, 0.32, 2]],
                  columns = list('ABC'))

print(df1)


    A   B
0  15  12
1  33  54
2  10  32

In [57]:
print(df1.eval('B - A'))


0    -3
1    21
2    22
dtype: int64

In [58]:
df1.eval('C = B - A', inplace=False)   # the 'C =' target is optional; inplace=True returns nothing and modifies df1 itself


Out[58]:
    A   B   C
0  15  12  -3
1  33  54  21
2  10  32  22

In [59]:
loc_var = np.arange(3)
df1.eval('C = A + @loc_var', inplace=False)   # @ lets eval reference variables from the surrounding Python scope


Out[59]:
    A   B   C
0  15  12  15
1  33  54  34
2  10  32  12

In [60]:
print(df1)


    A   B
0  15  12
1  33  54
2  10  32
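
The same expression engine also backs query(), which filters rows with a string condition (a hedged aside, not in the original notebook):

print(df1.query('A > 10'))   # rows of df1 where column A exceeds 10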

Comparison Operations


In [61]:
print(df1.equals(df2))  # compares the two frames as a whole and returns a single bool


False

In [62]:
print(df2.eq(df3))


       A      B      C
0  False  False  False
1   True  False  False
2   True  False  False

In [63]:
print(df2.le(df3))


       A      B      C
0  False  False   True
1   True  False  False
2   True  False  False

In [64]:
print(df2.ne(df3))


       A     B     C
0   True  True  True
1  False  True  True
2  False  True  True

Merging dfs

Concatenating at the bottom (row-wise)


In [65]:
print(pd.concat( [df2,df3], axis = 0 ))


    A      B   C
0  15  12.00  -3
1  33  54.00  21
2  10  32.00  22
0  10   1.00   3
1  33 -54.00   2
2  10   0.32   2

Concatenating at the side (column-wise)


In [66]:
print(pd.concat( [df2,df3], axis = 1 ))


    A   B   C   A      B  C
0  15  12  -3  10   1.00  3
1  33  54  21  33 -54.00  2
2  10  32  22  10   0.32  2
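
Besides concatenation, pandas offers SQL-style joins via pd.merge; a minimal sketch on two small hypothetical frames (not from the original notebook):

left = pd.DataFrame({'key': ['K0', 'K1'], 'L': [1, 2]})
right = pd.DataFrame({'key': ['K1', 'K2'], 'R': [3, 4]})
print(pd.merge(left, right, on = 'key', how = 'inner'))   # only the shared key 'K1' survives the inner join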

Reshaping & Pivot tables

Pivot tables


In [67]:
df = pd.DataFrame([
           ['PID0', 'Gold', '1$', '2£'],
           ['PID0', 'Bronze', '2$', '4£'],
           ['PID1', 'Gold', '3$', '6£'],
           ['PID1', 'Silver', '4$', '8£'],
           ], columns = ['PID', 'CType', 
               'USD', 'PND'])
                 
print(df)


    PID   CType USD PND
0  PID0    Gold  1$  2£
1  PID0  Bronze  2$  4£
2  PID1    Gold  3$  6£
3  PID1  Silver  4$  8£

In [68]:
print(df.pivot(index = 'PID', columns = 'CType'))


         USD                PND            
CType Bronze Gold Silver Bronze Gold Silver
PID                                        
PID0      2$   1$   None     4£   2£   None
PID1    None   3$     4$   None   6£     8£

In [69]:
print(df.pivot(index = 'PID', columns = 'CType',
              values = 'USD'))


CType Bronze Gold Silver
PID                     
PID0      2$   1$   None
PID1    None   3$     4$

In [70]:
df = pd.DataFrame([
           ['PID0', 'Gold', '1$'],
           ['PID0', 'Bronze', '2$'],
           ['PID0', 'Gold', '3$'],     #Duplicate
           ['PID1', 'Silver', '4$'],
           ], columns = ['PID', 'CType', 
               'USD'])

In [71]:
# Ambiguity: both 1$ and 3$ map to the (PID0, Gold) cell,
# so a plain pivot() cannot decide which value to keep.

print(df)


    PID   CType USD
0  PID0    Gold  1$
1  PID0  Bronze  2$
2  PID0    Gold  3$
3  PID1  Silver  4$
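
To see the ambiguity directly, a plain pivot() on this frame raises an error (a small hedged sketch):

try:
    df.pivot(index = 'PID', columns = 'CType', values = 'USD')
except ValueError as err:
    print('pivot failed:', err)   # duplicate (PID0, Gold) entries cannot be reshaped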

In [72]:
print(df.pivot_table(index = 'PID', 
                    columns = 'CType',
                    aggfunc = np.max))


         USD             
CType Bronze  Gold Silver
PID                      
PID0      2$    3$   None
PID1    None  None     4$

Reshaping


In [73]:
multi_ind = pd.MultiIndex.from_tuples(
                                     [('IND','R1'),
                                      ('IND','R2'),
                                      ('US','R1'),
                                      ('US','R2')],
                                      names = [
                                               '1st',
                                               '2nd'])
df = pd.DataFrame(np.random.randn(4,2),
                 index = multi_ind,
                 columns = ['C1', 'C2'])

print(df)


               C1        C2
1st 2nd                    
IND R1   0.609643  0.767526
    R2   0.212994  0.589823
US  R1  -0.745894 -0.024052
    R2  -0.435979 -0.980595

In [74]:
print(df.stack())   # columns move into the row index, so the frame gets longer


1st  2nd    
IND  R1   C1    0.609643
          C2    0.767526
     R2   C1    0.212994
          C2    0.589823
US   R1   C1   -0.745894
          C2   -0.024052
     R2   C1   -0.435979
          C2   -0.980595
dtype: float64

In [75]:
print(df.unstack()) # the innermost index level moves into the columns, so the frame gets wider


           C1                  C2          
2nd        R1        R2        R1        R2
1st                                        
IND  0.609643  0.212994  0.767526  0.589823
US  -0.745894 -0.435979 -0.024052 -0.980595
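
unstack can also pivot a specific level instead of the innermost one (a brief aside):

print(df.unstack(level = 0))   # move the '1st' level (IND/US) into the columns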

Grouping


In [76]:
df = pd.DataFrame([["B", 1],
                  ["B", 2],
                  ["A", 3],
                  ["A", 4]], columns = list('XY'))

print(df)


   X  Y
0  B  1
1  B  2
2  A  3
3  A  4

In [77]:
print(df.groupby(['X']).sum())


   Y
X   
A  7
B  3

In [78]:
print(df.groupby(['X'], sort = False).sum())


   Y
X   
B  3
A  7

In [79]:
df = pd.DataFrame({'A' : list('ppppqqqq'),
                  'B' : list('rrssrrss'),
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})

print(df)


   A  B         C         D
0  p  r -0.035902 -1.610307
1  p  r  0.817545  1.032971
2  p  s  0.874735 -0.811189
3  p  s -0.480971 -1.050927
4  q  r  0.724361  1.857076
5  q  r -0.093597 -0.479472
6  q  s -0.199158 -0.220293
7  q  s -0.082610 -0.028525

In [80]:
print(df.groupby(['A', 'B']).agg(
                C_mean = ('C', 'mean'),
                D_median = ('D', 'median')))   # named aggregation; the old dict-renaming form is deprecated


       C_mean  D_median
A B                    
p r  0.390822 -0.288668
  s  0.196882 -0.931058
q r  0.315382  0.688802
  s -0.140884 -0.124409
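
A related sketch: size() reports how many rows fall in each group.

print(df.groupby(['A', 'B']).size())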

Import and Export


In [81]:
df = pd.DataFrame([[11, 202],
                  [33, 44]],
                  index = list('AB'),
                   columns = list('CD'))

Writing to an Excel file


In [82]:
df.to_excel('pd_df.xlsx', sheet_name = 'Sheet1')

Reading from an Excel file


In [83]:
print(pd.read_excel('pd_df.xlsx', 'Sheet1'))


    C    D
A  11  202
B  33   44
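
The same pattern works for CSV files (a parallel sketch, assuming write access to the working directory):

df.to_csv('pd_df.csv')
print(pd.read_csv('pd_df.csv', index_col = 0))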